{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "original-wedding",
   "metadata": {},
   "source": [
    "# STSI demo using Sentinel 2 metadata for the self-intersection task\n",
    "# STSI used temporal slicing index\n",
    "## This demo could be used to find Spatio-temporal intersection for Sentinel-2 and itself"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "polar-techno",
   "metadata": {},
   "source": [
    "## please input the potential area of interest, we have \"Beaufort_Sea\" and \"Wandel_Sea\" in in this demo, could leave blank if not using AOI"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "sustained-mississippi",
   "metadata": {},
   "outputs": [],
   "source": [
    "### test temporal index\n",
    "### test self interact for Sentinel-2 data\n",
    "## the basic idea is to generate a temporal id, and classify data into categories according to time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "living-african",
   "metadata": {},
   "outputs": [],
   "source": [
    "# date in the format of \"yyy-MM-dd\", results include both start date and end date\n",
    "\n",
    "# The hour delay variable represents how many hours apart the two datasets are considered to be temporally intersected\n",
    "# E.g. if hour_delay = 6, for 2020-06-26 12:00:00, timestamps between 2020-06-26 06:00:00 and 2020-06-26 18:00:00 are considered to be temporally intersected\n",
    "start_date = '2020-10-01'\n",
    "end_date = '2020-12-31'\n",
    "time_index_hour_length = 24\n",
    "hour_delay = 6 ### test hour for 4 days, \n",
    "duration_month = 3 # for export"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "spoken-basis",
   "metadata": {},
   "source": [
    "## please input the potential area of interest, we have \"Beaufort_Sea\" and \"Wandel_Sea\" in in this demo"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "public-coral",
   "metadata": {},
   "outputs": [],
   "source": [
    "Paoi = 'Wandel_Sea'"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "attended-copying",
   "metadata": {},
   "source": [
    "## First, setup the Spark and Sedona environment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "featured-gospel",
   "metadata": {},
   "outputs": [],
   "source": [
    "import findspark\n",
    "#findspark.init() "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "double-history",
   "metadata": {},
   "outputs": [],
   "source": [
    "SPARK_HOME='/opt/cloudera/parcels/CDH/lib/spark'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "median-university",
   "metadata": {},
   "outputs": [],
   "source": [
    "findspark.init(SPARK_HOME)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "geological-anime",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/zhangp/.conda/envs/py37_environment/lib/python3.7/site-packages/geopandas/_compat.py:58: UserWarning: The installed version of PyGEOS is too old (0.6 installed, 0.8 required), and thus GeoPandas will not use PyGEOS.\n",
      "  UserWarning,\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "import os\n",
    "import codecs\n",
    "import subprocess\n",
    "#from hdfs import InsecureClient\n",
    "import numpy as np\n",
    "#from pyspark import SparkContext\n",
    "from pyspark import SQLContext\n",
    "from pyspark.sql import Row\n",
    "from pyspark.sql import functions as F\n",
    "from pyspark.sql.types import *\n",
    "import rtree\n",
    "from pyspark.sql import Window\n",
    "#import igraph\n",
    "#from igraph import Graph\n",
    "import geofeather"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "indonesian-garage",
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.sql import SparkSession\n",
    "from pyspark import StorageLevel\n",
    "import geopandas as gpd\n",
    "import pandas as pd\n",
    "from pyspark.sql.types import StructType\n",
    "from pyspark.sql.types import StructField\n",
    "from pyspark.sql.types import StringType\n",
    "from pyspark.sql.types import LongType\n",
    "from shapely.geometry import Point\n",
    "from shapely.geometry import Polygon\n",
    "\n",
    "from sedona.register import SedonaRegistrator\n",
    "from sedona.core.SpatialRDD import SpatialRDD\n",
    "from sedona.core.SpatialRDD import PointRDD\n",
    "from sedona.core.SpatialRDD import PolygonRDD\n",
    "from sedona.core.SpatialRDD import LineStringRDD\n",
    "from sedona.core.enums import FileDataSplitter\n",
    "from sedona.utils.adapter import Adapter\n",
    "from sedona.core.spatialOperator import KNNQuery\n",
    "from sedona.core.spatialOperator import JoinQuery\n",
    "from sedona.core.spatialOperator import JoinQueryRaw\n",
    "from sedona.core.spatialOperator import RangeQuery\n",
    "from sedona.core.spatialOperator import RangeQueryRaw\n",
    "from sedona.core.formatMapper.shapefileParser import ShapefileReader\n",
    "from sedona.core.formatMapper import WkbReader\n",
    "from sedona.core.formatMapper import WktReader\n",
    "from sedona.core.formatMapper import GeoJsonReader\n",
    "from sedona.sql.types import GeometryType\n",
    "from sedona.core.enums import GridType\n",
    "from sedona.core.SpatialRDD import RectangleRDD\n",
    "from sedona.core.enums import IndexType\n",
    "from sedona.core.geom.envelope import Envelope\n",
    "from sedona.utils import SedonaKryoRegistrator, KryoSerializer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "revised-point",
   "metadata": {},
   "outputs": [],
   "source": [
    "os.environ['PYSPARK_PYTHON'] = \"./environment/bin/python\"\n",
    "os.environ['YARN_CONF_DIR'] = \"/opt/cloudera/parcels/CDH/lib/spark/conf/yarn-conf\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "essential-radiation",
   "metadata": {},
   "outputs": [],
   "source": [
    "spark = SparkSession \\\n",
    ".builder \\\n",
    ".appName(\"test_test_1\") \\\n",
    ".master('yarn') \\\n",
    ".config(\"spark.serializer\", KryoSerializer.getName) \\\n",
    ".config(\"spark.kryo.registrator\", SedonaKryoRegistrator.getName) \\\n",
    ".config('spark.jars','sedona-core-2.4_2.11-1.0.0-incubating.jar,sedona-sql-2.4_2.11-1.0.0-incubating.jar,sedona-python-adapter-2.4_2.11-1.0.0-incubating.jar,sedona-viz-2.4_2.11-1.0.0-incubating.jar,geotools-wrapper-geotools-24.0.jar') \\\n",
    ".config('spark.executor.memory', '20g') \\\n",
    ".config('spark.driver.memory', '10g') \\\n",
    ".config('spark.sql.shuffle.partitions', 6144) \\\n",
    ".config('spark.executor.instances', '24') \\\n",
    ".config('spark.executor.cores', '5') \\\n",
    ".config('spark.rpc.message.maxSize', '1024') \\\n",
    ".config('spark.yarn.dist.archives', 'environment.tar.gz#environment') \\\n",
    ".getOrCreate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "supreme-strategy",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "SedonaRegistrator.registerAll(spark)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "conditional-broadway",
   "metadata": {},
   "source": [
    "## Second, read datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "alpha-uzbekistan",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Extract data within potential time slots\n",
    "import time\n",
    "import datetime\n",
    "\n",
    "start_timestamp = time.mktime(datetime.datetime.strptime(start_date, \"%Y-%m-%d\").timetuple())\n",
    "start_timestamp = start_timestamp - hour_delay * 3600 # for potential time delay\n",
    "\n",
    "end_timestamp = time.mktime(datetime.datetime.strptime(end_date, \"%Y-%m-%d\").timetuple())\n",
    "end_timestamp = end_timestamp + hour_delay * 3600 # for potential time delay"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "eastern-steal",
   "metadata": {},
   "source": [
    "### read Sentinel data, Spark have different types of input methods, here we read from Hadoop file system, as our Spark 2.4 is built based on Hadoop Yarn\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "surface-shore",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "sentinel_df = spark.read.option(\"header\",True).options(delimiter='|').csv(\"meta_s2_2020_wkt_cloudcover.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "nuclear-stock",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- _c0: string (nullable = true)\n",
      " |-- s2_index: string (nullable = true)\n",
      " |-- timestamp: string (nullable = true)\n",
      " |-- cloudcover_assessment: string (nullable = true)\n",
      " |-- wkt_geo: string (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "sentinel_df.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "offensive-soldier",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "universal-solomon",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+--------------------+--------------------+---------------------+--------------------+\n",
      "|_c0|            s2_index|           timestamp|cloudcover_assessment|             wkt_geo|\n",
      "+---+--------------------+--------------------+---------------------+--------------------+\n",
      "|  0|20200930T235749_2...|2020-10-01T00:00:...|            93.659861|POLYGON ((154.925...|\n",
      "|  1|20200930T235749_2...|2020-10-01T00:00:...|            99.919739|POLYGON ((155.851...|\n",
      "|  2|20200930T235749_2...|2020-10-01T00:00:...|             98.79224|POLYGON ((154.841...|\n",
      "|  3|20200930T235749_2...|2020-10-01T00:00:...|            88.397203|POLYGON ((156.773...|\n",
      "|  4|20200930T235749_2...|2020-10-01T00:00:...|            93.500239|POLYGON ((155.752...|\n",
      "|  5|20200930T235749_2...|2020-10-01T00:00:...|            88.250346|POLYGON ((156.870...|\n",
      "|  6|20200930T235749_2...|2020-10-01T00:00:...|            81.360003|POLYGON ((156.237...|\n",
      "|  7|20201001T000249_2...|2020-10-01T00:09:...|            11.674878|POLYGON ((145.020...|\n",
      "|  8|20201001T000249_2...|2020-10-01T00:08:...|            32.067495|POLYGON ((145.605...|\n",
      "|  9|20201001T000249_2...|2020-10-01T00:08:...|            46.908053|POLYGON ((145.917...|\n",
      "| 10|20201001T000249_2...|2020-10-01T00:09:...|             5.032759|POLYGON ((146.714...|\n",
      "| 11|20201001T000249_2...|2020-10-01T00:09:...|            21.563988|POLYGON ((147.122...|\n",
      "| 12|20201001T000249_2...|2020-10-01T00:08:...|            46.979577|POLYGON ((145.783...|\n",
      "| 13|20201001T000249_2...|2020-10-01T00:08:...|            41.197359|POLYGON ((147.116...|\n",
      "| 14|20201001T000249_2...|2020-10-01T00:08:...|            11.354851|POLYGON ((147.115...|\n",
      "| 15|20201001T000249_2...|2020-10-01T00:08:...|            11.036312|POLYGON ((146.241...|\n",
      "| 16|20201001T000249_2...|2020-10-01T00:09:...|             14.95059|POLYGON ((146.999...|\n",
      "| 17|20201001T000249_2...|2020-10-01T00:09:...|            18.875763|POLYGON ((148.354...|\n",
      "| 18|20201001T000249_2...|2020-10-01T00:08:...|            24.837191|POLYGON ((146.999...|\n",
      "| 19|20201001T000249_2...|2020-10-01T00:08:...|            27.403476|POLYGON ((146.999...|\n",
      "+---+--------------------+--------------------+---------------------+--------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "sentinel_df.show(20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "advance-intermediate",
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.sql.types import DoubleType\n",
    "sentinel_df = sentinel_df.withColumn('date_date_type', F.to_timestamp(F.udf(lambda x: x[:10] + ' ' + x[11:19])(F.col('timestamp')), 'yyyy-MM-dd HH:mm:ss')).\\\n",
    "withColumn('cloudcover_assessment', F.col('cloudcover_assessment').cast(DoubleType()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "imposed-communist",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "dominican-investigation",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+--------------------+--------------------+---------------------+--------------------+-------------------+\n",
      "|_c0|            s2_index|           timestamp|cloudcover_assessment|             wkt_geo|     date_date_type|\n",
      "+---+--------------------+--------------------+---------------------+--------------------+-------------------+\n",
      "|  0|20200930T235749_2...|2020-10-01T00:00:...|            93.659861|POLYGON ((154.925...|2020-10-01 00:00:09|\n",
      "|  1|20200930T235749_2...|2020-10-01T00:00:...|            99.919739|POLYGON ((155.851...|2020-10-01 00:00:13|\n",
      "|  2|20200930T235749_2...|2020-10-01T00:00:...|             98.79224|POLYGON ((154.841...|2020-10-01 00:00:08|\n",
      "|  3|20200930T235749_2...|2020-10-01T00:00:...|            88.397203|POLYGON ((156.773...|2020-10-01 00:00:13|\n",
      "|  4|20200930T235749_2...|2020-10-01T00:00:...|            93.500239|POLYGON ((155.752...|2020-10-01 00:00:05|\n",
      "|  5|20200930T235749_2...|2020-10-01T00:00:...|            88.250346|POLYGON ((156.870...|2020-10-01 00:00:12|\n",
      "|  6|20200930T235749_2...|2020-10-01T00:00:...|            81.360003|POLYGON ((156.237...|2020-10-01 00:00:04|\n",
      "|  7|20201001T000249_2...|2020-10-01T00:09:...|            11.674878|POLYGON ((145.020...|2020-10-01 00:09:07|\n",
      "|  8|20201001T000249_2...|2020-10-01T00:08:...|            32.067495|POLYGON ((145.605...|2020-10-01 00:08:56|\n",
      "|  9|20201001T000249_2...|2020-10-01T00:08:...|            46.908053|POLYGON ((145.917...|2020-10-01 00:08:43|\n",
      "+---+--------------------+--------------------+---------------------+--------------------+-------------------+\n",
      "only showing top 10 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "sentinel_df.show(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "abroad-carnival",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- _c0: string (nullable = true)\n",
      " |-- s2_index: string (nullable = true)\n",
      " |-- timestamp: string (nullable = true)\n",
      " |-- cloudcover_assessment: double (nullable = true)\n",
      " |-- wkt_geo: string (nullable = true)\n",
      " |-- date_date_type: timestamp (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "sentinel_df.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "julian-decision",
   "metadata": {},
   "outputs": [],
   "source": [
    "# extract spatial information using WKT format footprint\n",
    "sentinel_df.createOrReplaceTempView(\"sentinel_df\")\n",
    "sentinel_sedona = spark.sql(\"select ST_GeomFromWKT(sentinel_df.wkt_geo) as geometry, sentinel_df.s2_index as s2_index, sentinel_df.date_date_type as date_date_type from sentinel_df\")\n",
    "sentinel_sedona.createOrReplaceTempView(\"sentinel_sedona\")\n",
    "sentinel_sedona = sentinel_sedona.withColumnRenamed('date_date_type', 'Sentinel_date_date_type')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "juvenile-diagnosis",
   "metadata": {},
   "outputs": [],
   "source": [
    "# sentinel_sedona.createOrReplaceTempView(\"sentinel_sedona\")\n",
    "# sentinel_sedona = spark.sql('select ST_Envelope(sentinel_sedona.geometry) as envelope, * from sentinel_sedona')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "proprietary-landing",
   "metadata": {},
   "outputs": [],
   "source": [
    "# sentinel_sedona.createOrReplaceTempView(\"sentinel_sedona\")\n",
    "# sentinel_sedona = spark.sql('select ST_Y(sentinel_sedona.geometry) as test_Y, * from sentinel_sedona')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "willing-namibia",
   "metadata": {},
   "outputs": [],
   "source": [
    "# test_sentinel_pd = sentinel_sedona.limit(5).toPandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "whole-taylor",
   "metadata": {},
   "outputs": [],
   "source": [
    "# test_sentinel_pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "seventh-interim",
   "metadata": {},
   "outputs": [],
   "source": [
    "# str(test_sentinel_pd.loc[0, 'envelope'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "balanced-command",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- geometry: geometry (nullable = false)\n",
      " |-- s2_index: string (nullable = true)\n",
      " |-- Sentinel_date_date_type: timestamp (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "sentinel_sedona.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "reliable-memorial",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+--------------------+-----------------------+\n",
      "|            geometry|            s2_index|Sentinel_date_date_type|\n",
      "+--------------------+--------------------+-----------------------+\n",
      "|POLYGON ((177.977...|20201001T002611_2...|    2020-10-01 00:27:04|\n",
      "|POLYGON ((-179.62...|20201001T002611_2...|    2020-10-01 00:26:30|\n",
      "|POLYGON ((169.906...|20201001T002611_2...|    2020-10-01 00:27:49|\n",
      "|POLYGON ((170.004...|20201001T002611_2...|    2020-10-01 00:27:35|\n",
      "|POLYGON ((170.421...|20201001T002611_2...|    2020-10-01 00:27:24|\n",
      "|POLYGON ((169.867...|20201001T002611_2...|    2020-10-01 00:27:56|\n",
      "|POLYGON ((168.665...|20201001T002611_2...|    2020-10-01 00:27:54|\n",
      "|POLYGON ((171.228...|20201001T002611_2...|    2020-10-01 00:27:46|\n",
      "|POLYGON ((168.977...|20201001T002611_2...|    2020-10-01 00:27:31|\n",
      "|POLYGON ((169.983...|20201001T002611_2...|    2020-10-01 00:27:18|\n",
      "|POLYGON ((170.999...|20201001T002611_2...|    2020-10-01 00:27:40|\n",
      "|POLYGON ((170.999...|20201001T002611_2...|    2020-10-01 00:27:26|\n",
      "|POLYGON ((170.999...|20201001T002611_2...|    2020-10-01 00:27:12|\n",
      "|POLYGON ((171.076...|20201001T002611_2...|    2020-10-01 00:26:58|\n",
      "|POLYGON ((173.546...|20201001T002611_2...|    2020-10-01 00:27:20|\n",
      "|POLYGON ((173.656...|20201001T002611_2...|    2020-10-01 00:27:06|\n",
      "|POLYGON ((173.778...|20201001T002611_2...|    2020-10-01 00:26:52|\n",
      "|POLYGON ((177.258...|20201001T002611_2...|    2020-10-01 00:27:03|\n",
      "|POLYGON ((178.511...|20201001T002611_2...|    2020-10-01 00:27:08|\n",
      "|POLYGON ((176.999...|20201001T002611_2...|    2020-10-01 00:26:30|\n",
      "+--------------------+--------------------+-----------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "sentinel_sedona.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "classified-weekend",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+--------------------+-----------------------+\n",
      "|            geometry|            s2_index|Sentinel_date_date_type|\n",
      "+--------------------+--------------------+-----------------------+\n",
      "|POLYGON ((177.977...|20201001T002611_2...|    2020-10-01 00:27:04|\n",
      "|POLYGON ((-179.62...|20201001T002611_2...|    2020-10-01 00:26:30|\n",
      "|POLYGON ((169.906...|20201001T002611_2...|    2020-10-01 00:27:49|\n",
      "|POLYGON ((170.004...|20201001T002611_2...|    2020-10-01 00:27:35|\n",
      "|POLYGON ((170.421...|20201001T002611_2...|    2020-10-01 00:27:24|\n",
      "|POLYGON ((169.867...|20201001T002611_2...|    2020-10-01 00:27:56|\n",
      "|POLYGON ((168.665...|20201001T002611_2...|    2020-10-01 00:27:54|\n",
      "|POLYGON ((171.228...|20201001T002611_2...|    2020-10-01 00:27:46|\n",
      "|POLYGON ((168.977...|20201001T002611_2...|    2020-10-01 00:27:31|\n",
      "|POLYGON ((169.983...|20201001T002611_2...|    2020-10-01 00:27:18|\n",
      "+--------------------+--------------------+-----------------------+\n",
      "only showing top 10 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Extract data within potential time slots\n",
    "\n",
    "sentinel_sedona = sentinel_sedona.filter(F.unix_timestamp(\"Sentinel_date_date_type\") <= end_timestamp).filter(F.unix_timestamp(\"Sentinel_date_date_type\") >= start_timestamp)\n",
    "sentinel_sedona.createOrReplaceTempView(\"sentinel_sedona\")\n",
    "\n",
    "sentinel_sedona.show(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "located-traffic",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+--------------------+-----------------------+----+----------+\n",
      "|            geometry|            s2_index|Sentinel_date_date_type|hour|      date|\n",
      "+--------------------+--------------------+-----------------------+----+----------+\n",
      "|POLYGON ((177.977...|20201001T002611_2...|    2020-10-01 00:27:04|   0|2020-10-01|\n",
      "|POLYGON ((-179.62...|20201001T002611_2...|    2020-10-01 00:26:30|   0|2020-10-01|\n",
      "|POLYGON ((169.906...|20201001T002611_2...|    2020-10-01 00:27:49|   0|2020-10-01|\n",
      "|POLYGON ((170.004...|20201001T002611_2...|    2020-10-01 00:27:35|   0|2020-10-01|\n",
      "|POLYGON ((170.421...|20201001T002611_2...|    2020-10-01 00:27:24|   0|2020-10-01|\n",
      "|POLYGON ((169.867...|20201001T002611_2...|    2020-10-01 00:27:56|   0|2020-10-01|\n",
      "|POLYGON ((168.665...|20201001T002611_2...|    2020-10-01 00:27:54|   0|2020-10-01|\n",
      "|POLYGON ((171.228...|20201001T002611_2...|    2020-10-01 00:27:46|   0|2020-10-01|\n",
      "|POLYGON ((168.977...|20201001T002611_2...|    2020-10-01 00:27:31|   0|2020-10-01|\n",
      "|POLYGON ((169.983...|20201001T002611_2...|    2020-10-01 00:27:18|   0|2020-10-01|\n",
      "+--------------------+--------------------+-----------------------+----+----------+\n",
      "only showing top 10 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "sentinel_sedona = sentinel_sedona.withColumn('hour', F.hour(F.col('Sentinel_date_date_type')))\n",
    "sentinel_sedona = sentinel_sedona.withColumn('date', F.to_date(F.col('Sentinel_date_date_type')))\n",
    "\n",
    "sentinel_sedona.createOrReplaceTempView(\"sentinel_sedona\")\n",
    "\n",
    "sentinel_sedona.show(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "simple-trinidad",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1601524800.0"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# try to generate a temporal index\n",
    "\n",
    "# find start timestamp\n",
    "import datetime\n",
    "import time\n",
    " \n",
    "# assigned regular string date\n",
    "start_date_datetype = datetime.datetime(2020, 10, 1, 0, 0)\n",
    "# print(\"date_time =>\",date_time)\n",
    " \n",
    "start_date_timestamp = time.mktime(start_date_datetype.timetuple())\n",
    "start_date_timestamp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "id": "congressional-bowling",
   "metadata": {},
   "outputs": [],
   "source": [
    "# get time_duration and hour_delay into seconds for comparison\n",
    "time_index_hour_length_second = time_index_hour_length * 3600\n",
    "hour_delay_second = hour_delay * 3600"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "lasting-stopping",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "86400"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "time_index_hour_length_second"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "id": "raised-condition",
   "metadata": {},
   "outputs": [],
   "source": [
    "sentinel_sedona_time_index = sentinel_sedona.withColumn('time_index', F.floor((F.unix_timestamp('Sentinel_date_date_type') - F.lit(start_date_timestamp)) / F.lit(time_index_hour_length_second)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "id": "rural-accident",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+--------------------+-----------------------+----+----------+----------+\n",
      "|            geometry|            s2_index|Sentinel_date_date_type|hour|      date|time_index|\n",
      "+--------------------+--------------------+-----------------------+----+----------+----------+\n",
      "|POLYGON ((177.977...|20201001T002611_2...|    2020-10-01 00:27:04|   0|2020-10-01|         0|\n",
      "|POLYGON ((-179.62...|20201001T002611_2...|    2020-10-01 00:26:30|   0|2020-10-01|         0|\n",
      "|POLYGON ((169.906...|20201001T002611_2...|    2020-10-01 00:27:49|   0|2020-10-01|         0|\n",
      "|POLYGON ((170.004...|20201001T002611_2...|    2020-10-01 00:27:35|   0|2020-10-01|         0|\n",
      "|POLYGON ((170.421...|20201001T002611_2...|    2020-10-01 00:27:24|   0|2020-10-01|         0|\n",
      "|POLYGON ((169.867...|20201001T002611_2...|    2020-10-01 00:27:56|   0|2020-10-01|         0|\n",
      "|POLYGON ((168.665...|20201001T002611_2...|    2020-10-01 00:27:54|   0|2020-10-01|         0|\n",
      "|POLYGON ((171.228...|20201001T002611_2...|    2020-10-01 00:27:46|   0|2020-10-01|         0|\n",
      "|POLYGON ((168.977...|20201001T002611_2...|    2020-10-01 00:27:31|   0|2020-10-01|         0|\n",
      "|POLYGON ((169.983...|20201001T002611_2...|    2020-10-01 00:27:18|   0|2020-10-01|         0|\n",
      "|POLYGON ((170.999...|20201001T002611_2...|    2020-10-01 00:27:40|   0|2020-10-01|         0|\n",
      "|POLYGON ((170.999...|20201001T002611_2...|    2020-10-01 00:27:26|   0|2020-10-01|         0|\n",
      "|POLYGON ((170.999...|20201001T002611_2...|    2020-10-01 00:27:12|   0|2020-10-01|         0|\n",
      "|POLYGON ((171.076...|20201001T002611_2...|    2020-10-01 00:26:58|   0|2020-10-01|         0|\n",
      "|POLYGON ((173.546...|20201001T002611_2...|    2020-10-01 00:27:20|   0|2020-10-01|         0|\n",
      "|POLYGON ((173.656...|20201001T002611_2...|    2020-10-01 00:27:06|   0|2020-10-01|         0|\n",
      "|POLYGON ((173.778...|20201001T002611_2...|    2020-10-01 00:26:52|   0|2020-10-01|         0|\n",
      "|POLYGON ((177.258...|20201001T002611_2...|    2020-10-01 00:27:03|   0|2020-10-01|         0|\n",
      "|POLYGON ((178.511...|20201001T002611_2...|    2020-10-01 00:27:08|   0|2020-10-01|         0|\n",
      "|POLYGON ((176.999...|20201001T002611_2...|    2020-10-01 00:26:30|   0|2020-10-01|         0|\n",
      "+--------------------+--------------------+-----------------------+----+----------+----------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "sentinel_sedona_time_index.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "id": "victorian-briefs",
   "metadata": {},
   "outputs": [],
   "source": [
    "sentinel_sedona_time_index_early = sentinel_sedona_time_index.filter(((F.unix_timestamp('Sentinel_date_date_type') - F.lit(start_date_timestamp)) % F.lit(time_index_hour_length_second)) <= hour_delay_second)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "id": "monetary-anatomy",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+--------------------+-----------------------+----+----------+----------+\n",
      "|            geometry|            s2_index|Sentinel_date_date_type|hour|      date|time_index|\n",
      "+--------------------+--------------------+-----------------------+----+----------+----------+\n",
      "|POLYGON ((177.977...|20201001T002611_2...|    2020-10-01 00:27:04|   0|2020-10-01|         0|\n",
      "|POLYGON ((-179.62...|20201001T002611_2...|    2020-10-01 00:26:30|   0|2020-10-01|         0|\n",
      "|POLYGON ((169.906...|20201001T002611_2...|    2020-10-01 00:27:49|   0|2020-10-01|         0|\n",
      "|POLYGON ((170.004...|20201001T002611_2...|    2020-10-01 00:27:35|   0|2020-10-01|         0|\n",
      "|POLYGON ((170.421...|20201001T002611_2...|    2020-10-01 00:27:24|   0|2020-10-01|         0|\n",
      "|POLYGON ((169.867...|20201001T002611_2...|    2020-10-01 00:27:56|   0|2020-10-01|         0|\n",
      "|POLYGON ((168.665...|20201001T002611_2...|    2020-10-01 00:27:54|   0|2020-10-01|         0|\n",
      "|POLYGON ((171.228...|20201001T002611_2...|    2020-10-01 00:27:46|   0|2020-10-01|         0|\n",
      "|POLYGON ((168.977...|20201001T002611_2...|    2020-10-01 00:27:31|   0|2020-10-01|         0|\n",
      "|POLYGON ((169.983...|20201001T002611_2...|    2020-10-01 00:27:18|   0|2020-10-01|         0|\n",
      "|POLYGON ((170.999...|20201001T002611_2...|    2020-10-01 00:27:40|   0|2020-10-01|         0|\n",
      "|POLYGON ((170.999...|20201001T002611_2...|    2020-10-01 00:27:26|   0|2020-10-01|         0|\n",
      "|POLYGON ((170.999...|20201001T002611_2...|    2020-10-01 00:27:12|   0|2020-10-01|         0|\n",
      "|POLYGON ((171.076...|20201001T002611_2...|    2020-10-01 00:26:58|   0|2020-10-01|         0|\n",
      "|POLYGON ((173.546...|20201001T002611_2...|    2020-10-01 00:27:20|   0|2020-10-01|         0|\n",
      "|POLYGON ((173.656...|20201001T002611_2...|    2020-10-01 00:27:06|   0|2020-10-01|         0|\n",
      "|POLYGON ((173.778...|20201001T002611_2...|    2020-10-01 00:26:52|   0|2020-10-01|         0|\n",
      "|POLYGON ((177.258...|20201001T002611_2...|    2020-10-01 00:27:03|   0|2020-10-01|         0|\n",
      "|POLYGON ((178.511...|20201001T002611_2...|    2020-10-01 00:27:08|   0|2020-10-01|         0|\n",
      "|POLYGON ((176.999...|20201001T002611_2...|    2020-10-01 00:26:30|   0|2020-10-01|         0|\n",
      "+--------------------+--------------------+-----------------------+----+----------+----------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "sentinel_sedona_time_index_early.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "seasonal-serve",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "id": "canadian-special",
   "metadata": {},
   "outputs": [],
   "source": [
    "sentinel_sedona_time_index_late = sentinel_sedona_time_index.filter(((F.unix_timestamp('Sentinel_date_date_type') - F.lit(start_date_timestamp)) % F.lit(time_index_hour_length_second)) >= (time_index_hour_length_second - hour_delay_second))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "id": "demanding-netscape",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+--------------------+-----------------------+----+----------+----------+\n",
      "|            geometry|            s2_index|Sentinel_date_date_type|hour|      date|time_index|\n",
      "+--------------------+--------------------+-----------------------+----+----------+----------+\n",
      "|POLYGON ((-90.087...|20201001T180059_2...|    2020-10-01 18:03:51|  18|2020-10-01|         0|\n",
      "|POLYGON ((-90.101...|20201001T180059_2...|    2020-10-01 18:03:42|  18|2020-10-01|         0|\n",
      "|POLYGON ((-86.716...|20201001T180059_2...|    2020-10-01 18:03:50|  18|2020-10-01|         0|\n",
      "|POLYGON ((-85.876...|20201001T180059_2...|    2020-10-01 18:03:53|  18|2020-10-01|         0|\n",
      "|POLYGON ((-87.000...|20201001T180059_2...|    2020-10-01 18:03:47|  18|2020-10-01|         0|\n",
      "|POLYGON ((-82.643...|20201001T180059_2...|    2020-10-01 18:03:50|  18|2020-10-01|         0|\n",
      "|POLYGON ((-82.777...|20201001T180059_2...|    2020-10-01 18:03:40|  18|2020-10-01|         0|\n",
      "|POLYGON ((-86.701...|20201001T180059_2...|    2020-10-01 18:03:42|  18|2020-10-01|         0|\n",
      "|POLYGON ((-87.572...|20201001T180059_2...|    2020-10-01 18:03:28|  18|2020-10-01|         0|\n",
      "|POLYGON ((-87.000...|20201001T180059_2...|    2020-10-01 18:03:21|  18|2020-10-01|         0|\n",
      "|POLYGON ((-82.943...|20201001T180059_2...|    2020-10-01 18:02:44|  18|2020-10-01|         0|\n",
      "|POLYGON ((-83.939...|20201001T180059_2...|    2020-10-01 18:03:27|  18|2020-10-01|         0|\n",
      "|POLYGON ((-82.557...|20201001T180059_2...|    2020-10-01 18:03:49|  18|2020-10-01|         0|\n",
      "|POLYGON ((-82.731...|20201001T180059_2...|    2020-10-01 18:03:38|  18|2020-10-01|         0|\n",
      "|POLYGON ((-81.000...|20201001T180059_2...|    2020-10-01 18:03:27|  18|2020-10-01|         0|\n",
      "|POLYGON ((-80.717...|20201001T180059_2...|    2020-10-01 18:03:26|  18|2020-10-01|         0|\n",
      "|POLYGON ((-80.685...|20201001T180059_2...|    2020-10-01 18:03:14|  18|2020-10-01|         0|\n",
      "|POLYGON ((-81.544...|20201001T180059_2...|    2020-10-01 18:02:26|  18|2020-10-01|         0|\n",
      "|POLYGON ((-78.531...|20201001T180059_2...|    2020-10-01 18:03:18|  18|2020-10-01|         0|\n",
      "|POLYGON ((-78.571...|20201001T180059_2...|    2020-10-01 18:02:16|  18|2020-10-01|         0|\n",
      "+--------------------+--------------------+-----------------------+----+----------+----------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "sentinel_sedona_time_index_late.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "twenty-differential",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "id": "approved-blond",
   "metadata": {},
   "outputs": [],
   "source": [
    "sentinel_sedona_dateforjoin = sentinel_sedona_time_index.withColumn('time_index_join', F.col('time_index')).union(sentinel_sedona_time_index_late.withColumn('time_index_join', (F.col('time_index') + 1))).union(sentinel_sedona_time_index_early.withColumn('time_index_join', (F.col('time_index') - 1)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "id": "coupled-moisture",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+--------------------+-----------------------+----+----------+----------+---------------+\n",
      "|            geometry|            s2_index|Sentinel_date_date_type|hour|      date|time_index|time_index_join|\n",
      "+--------------------+--------------------+-----------------------+----+----------+----------+---------------+\n",
      "|POLYGON ((177.977...|20201001T002611_2...|    2020-10-01 00:27:04|   0|2020-10-01|         0|              0|\n",
      "|POLYGON ((-179.62...|20201001T002611_2...|    2020-10-01 00:26:30|   0|2020-10-01|         0|              0|\n",
      "|POLYGON ((169.906...|20201001T002611_2...|    2020-10-01 00:27:49|   0|2020-10-01|         0|              0|\n",
      "|POLYGON ((170.004...|20201001T002611_2...|    2020-10-01 00:27:35|   0|2020-10-01|         0|              0|\n",
      "|POLYGON ((170.421...|20201001T002611_2...|    2020-10-01 00:27:24|   0|2020-10-01|         0|              0|\n",
      "|POLYGON ((169.867...|20201001T002611_2...|    2020-10-01 00:27:56|   0|2020-10-01|         0|              0|\n",
      "|POLYGON ((168.665...|20201001T002611_2...|    2020-10-01 00:27:54|   0|2020-10-01|         0|              0|\n",
      "|POLYGON ((171.228...|20201001T002611_2...|    2020-10-01 00:27:46|   0|2020-10-01|         0|              0|\n",
      "|POLYGON ((168.977...|20201001T002611_2...|    2020-10-01 00:27:31|   0|2020-10-01|         0|              0|\n",
      "|POLYGON ((169.983...|20201001T002611_2...|    2020-10-01 00:27:18|   0|2020-10-01|         0|              0|\n",
      "|POLYGON ((170.999...|20201001T002611_2...|    2020-10-01 00:27:40|   0|2020-10-01|         0|              0|\n",
      "|POLYGON ((170.999...|20201001T002611_2...|    2020-10-01 00:27:26|   0|2020-10-01|         0|              0|\n",
      "|POLYGON ((170.999...|20201001T002611_2...|    2020-10-01 00:27:12|   0|2020-10-01|         0|              0|\n",
      "|POLYGON ((171.076...|20201001T002611_2...|    2020-10-01 00:26:58|   0|2020-10-01|         0|              0|\n",
      "|POLYGON ((173.546...|20201001T002611_2...|    2020-10-01 00:27:20|   0|2020-10-01|         0|              0|\n",
      "|POLYGON ((173.656...|20201001T002611_2...|    2020-10-01 00:27:06|   0|2020-10-01|         0|              0|\n",
      "|POLYGON ((173.778...|20201001T002611_2...|    2020-10-01 00:26:52|   0|2020-10-01|         0|              0|\n",
      "|POLYGON ((177.258...|20201001T002611_2...|    2020-10-01 00:27:03|   0|2020-10-01|         0|              0|\n",
      "|POLYGON ((178.511...|20201001T002611_2...|    2020-10-01 00:27:08|   0|2020-10-01|         0|              0|\n",
      "|POLYGON ((176.999...|20201001T002611_2...|    2020-10-01 00:26:30|   0|2020-10-01|         0|              0|\n",
      "+--------------------+--------------------+-----------------------+----+----------+----------+---------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "sentinel_sedona_dateforjoin.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "id": "circular-prospect",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "74630"
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sentinel_sedona_dateforjoin.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "id": "yellow-richards",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "50604"
      ]
     },
     "execution_count": 64,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sentinel_sedona_dateforjoin.drop_duplicates(subset=['s2_index']).count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cathedral-rugby",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "limiting-chinese",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "handed-summer",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "mediterranean-general",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "proved-executive",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "official-bolivia",
   "metadata": {},
   "source": [
    "## Third, spatio-temporal join Sentinel dataset and ICE-Sat 2 dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "id": "political-fruit",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 3.1 join the two Sedona dataframe according the timestamps, hour level at now, consider time delay"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "id": "welcome-fetish",
   "metadata": {},
   "outputs": [],
   "source": [
    "sentinel_sedona_dateforjoin_maxmin = sentinel_sedona_dateforjoin.withColumn('sentinel_max_timestamp', (F.unix_timestamp(\"Sentinel_date_date_type\") + hour_delay * 3600)).\\\n",
    "withColumn('sentinel_min_timestamp', (F.unix_timestamp(\"Sentinel_date_date_type\") - hour_delay * 3600))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "id": "sophisticated-homeless",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+--------------------+-----------------------+----+----------+----------+---------------+----------------------+----------------------+\n",
      "|            geometry|            s2_index|Sentinel_date_date_type|hour|      date|time_index|time_index_join|sentinel_max_timestamp|sentinel_min_timestamp|\n",
      "+--------------------+--------------------+-----------------------+----+----------+----------+---------------+----------------------+----------------------+\n",
      "|POLYGON ((177.977...|20201001T002611_2...|    2020-10-01 00:27:04|   0|2020-10-01|         0|              0|            1601548024|            1601504824|\n",
      "|POLYGON ((-179.62...|20201001T002611_2...|    2020-10-01 00:26:30|   0|2020-10-01|         0|              0|            1601547990|            1601504790|\n",
      "|POLYGON ((169.906...|20201001T002611_2...|    2020-10-01 00:27:49|   0|2020-10-01|         0|              0|            1601548069|            1601504869|\n",
      "|POLYGON ((170.004...|20201001T002611_2...|    2020-10-01 00:27:35|   0|2020-10-01|         0|              0|            1601548055|            1601504855|\n",
      "|POLYGON ((170.421...|20201001T002611_2...|    2020-10-01 00:27:24|   0|2020-10-01|         0|              0|            1601548044|            1601504844|\n",
      "|POLYGON ((169.867...|20201001T002611_2...|    2020-10-01 00:27:56|   0|2020-10-01|         0|              0|            1601548076|            1601504876|\n",
      "|POLYGON ((168.665...|20201001T002611_2...|    2020-10-01 00:27:54|   0|2020-10-01|         0|              0|            1601548074|            1601504874|\n",
      "|POLYGON ((171.228...|20201001T002611_2...|    2020-10-01 00:27:46|   0|2020-10-01|         0|              0|            1601548066|            1601504866|\n",
      "|POLYGON ((168.977...|20201001T002611_2...|    2020-10-01 00:27:31|   0|2020-10-01|         0|              0|            1601548051|            1601504851|\n",
      "|POLYGON ((169.983...|20201001T002611_2...|    2020-10-01 00:27:18|   0|2020-10-01|         0|              0|            1601548038|            1601504838|\n",
      "|POLYGON ((170.999...|20201001T002611_2...|    2020-10-01 00:27:40|   0|2020-10-01|         0|              0|            1601548060|            1601504860|\n",
      "|POLYGON ((170.999...|20201001T002611_2...|    2020-10-01 00:27:26|   0|2020-10-01|         0|              0|            1601548046|            1601504846|\n",
      "|POLYGON ((170.999...|20201001T002611_2...|    2020-10-01 00:27:12|   0|2020-10-01|         0|              0|            1601548032|            1601504832|\n",
      "|POLYGON ((171.076...|20201001T002611_2...|    2020-10-01 00:26:58|   0|2020-10-01|         0|              0|            1601548018|            1601504818|\n",
      "|POLYGON ((173.546...|20201001T002611_2...|    2020-10-01 00:27:20|   0|2020-10-01|         0|              0|            1601548040|            1601504840|\n",
      "|POLYGON ((173.656...|20201001T002611_2...|    2020-10-01 00:27:06|   0|2020-10-01|         0|              0|            1601548026|            1601504826|\n",
      "|POLYGON ((173.778...|20201001T002611_2...|    2020-10-01 00:26:52|   0|2020-10-01|         0|              0|            1601548012|            1601504812|\n",
      "|POLYGON ((177.258...|20201001T002611_2...|    2020-10-01 00:27:03|   0|2020-10-01|         0|              0|            1601548023|            1601504823|\n",
      "|POLYGON ((178.511...|20201001T002611_2...|    2020-10-01 00:27:08|   0|2020-10-01|         0|              0|            1601548028|            1601504828|\n",
      "|POLYGON ((176.999...|20201001T002611_2...|    2020-10-01 00:26:30|   0|2020-10-01|         0|              0|            1601547990|            1601504790|\n",
      "+--------------------+--------------------+-----------------------+----+----------+----------+---------------+----------------------+----------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "sentinel_sedona_dateforjoin_maxmin.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "id": "rough-aquarium",
   "metadata": {},
   "outputs": [],
   "source": [
    "# sentinel_sedona_dateforjoin_maxmin"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "id": "later-hundred",
   "metadata": {},
   "outputs": [],
   "source": [
    "# is2_df.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "id": "commercial-rapid",
   "metadata": {},
   "outputs": [],
   "source": [
    "# is2_df = is2_df.withColumn('time_index_join', F.floor((F.unix_timestamp('date_date_type') - F.lit(start_date_timestamp)) / F.lit(time_index_hour_length_second)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "id": "dated-hamburg",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+--------------------+-----------------------+----+----------+----------+\n",
      "|            geometry|            s2_index|Sentinel_date_date_type|hour|      date|time_index|\n",
      "+--------------------+--------------------+-----------------------+----+----------+----------+\n",
      "|POLYGON ((177.977...|20201001T002611_2...|    2020-10-01 00:27:04|   0|2020-10-01|         0|\n",
      "|POLYGON ((-179.62...|20201001T002611_2...|    2020-10-01 00:26:30|   0|2020-10-01|         0|\n",
      "|POLYGON ((169.906...|20201001T002611_2...|    2020-10-01 00:27:49|   0|2020-10-01|         0|\n",
      "|POLYGON ((170.004...|20201001T002611_2...|    2020-10-01 00:27:35|   0|2020-10-01|         0|\n",
      "|POLYGON ((170.421...|20201001T002611_2...|    2020-10-01 00:27:24|   0|2020-10-01|         0|\n",
      "|POLYGON ((169.867...|20201001T002611_2...|    2020-10-01 00:27:56|   0|2020-10-01|         0|\n",
      "|POLYGON ((168.665...|20201001T002611_2...|    2020-10-01 00:27:54|   0|2020-10-01|         0|\n",
      "|POLYGON ((171.228...|20201001T002611_2...|    2020-10-01 00:27:46|   0|2020-10-01|         0|\n",
      "|POLYGON ((168.977...|20201001T002611_2...|    2020-10-01 00:27:31|   0|2020-10-01|         0|\n",
      "|POLYGON ((169.983...|20201001T002611_2...|    2020-10-01 00:27:18|   0|2020-10-01|         0|\n",
      "|POLYGON ((170.999...|20201001T002611_2...|    2020-10-01 00:27:40|   0|2020-10-01|         0|\n",
      "|POLYGON ((170.999...|20201001T002611_2...|    2020-10-01 00:27:26|   0|2020-10-01|         0|\n",
      "|POLYGON ((170.999...|20201001T002611_2...|    2020-10-01 00:27:12|   0|2020-10-01|         0|\n",
      "|POLYGON ((171.076...|20201001T002611_2...|    2020-10-01 00:26:58|   0|2020-10-01|         0|\n",
      "|POLYGON ((173.546...|20201001T002611_2...|    2020-10-01 00:27:20|   0|2020-10-01|         0|\n",
      "|POLYGON ((173.656...|20201001T002611_2...|    2020-10-01 00:27:06|   0|2020-10-01|         0|\n",
      "|POLYGON ((173.778...|20201001T002611_2...|    2020-10-01 00:26:52|   0|2020-10-01|         0|\n",
      "|POLYGON ((177.258...|20201001T002611_2...|    2020-10-01 00:27:03|   0|2020-10-01|         0|\n",
      "|POLYGON ((178.511...|20201001T002611_2...|    2020-10-01 00:27:08|   0|2020-10-01|         0|\n",
      "|POLYGON ((176.999...|20201001T002611_2...|    2020-10-01 00:26:30|   0|2020-10-01|         0|\n",
      "+--------------------+--------------------+-----------------------+----+----------+----------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "sentinel_sedona_time_index.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a6976f5e-4803-4f63-a958-e4bf0dc183bc",
   "metadata": {},
   "source": [
    "## copy Sentinel data for self-intersection task"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "id": "compact-grounds",
   "metadata": {},
   "outputs": [],
   "source": [
    "sentinel_sedona_time_index_forjoin = sentinel_sedona_time_index.withColumnRenamed('geometry', 's2_geometry').\\\n",
    "withColumnRenamed('s2_index', 's2_index_2').\\\n",
    "withColumnRenamed('Sentinel_date_date_type', 'Sentinel_date_date_type_2').\\\n",
    "withColumnRenamed('hour', 'hour_2').\\\n",
    "withColumnRenamed('time_index', 'time_index_2')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "id": "fantastic-lesbian",
   "metadata": {},
   "outputs": [],
   "source": [
    "# sentinel_sedona_dateforjoin_maxmin = sentinel_sedona_dateforjoin_maxmin.filter(F.col('cloudcover_assessment') < F.lit(50)).filter((F.col('min_lat') > F.lit(60)) | (F.col('max_lat') < F.lit(-60)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "id": "activated-oklahoma",
   "metadata": {},
   "outputs": [],
   "source": [
    "sentinel_sedona_dateforjoin_maxmin.createOrReplaceTempView(\"sentinel_sedona_dateforjoin_maxmin\")\n",
    "sentinel_sedona_time_index_forjoin.createOrReplaceTempView(\"sentinel_sedona_time_index_forjoin\")\n",
    "test_temporal_indexed_spatial_join = spark.sql(\"SELECT * FROM sentinel_sedona_dateforjoin_maxmin, sentinel_sedona_time_index_forjoin WHERE ST_Intersects(sentinel_sedona_dateforjoin_maxmin.geometry, sentinel_sedona_time_index_forjoin.s2_geometry) AND sentinel_sedona_dateforjoin_maxmin.time_index_join = sentinel_sedona_time_index_forjoin.time_index_2\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "id": "hidden-communication",
   "metadata": {},
   "outputs": [],
   "source": [
    "# test_temporal_indexed_spatial_join.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "id": "second-chinese",
   "metadata": {},
   "outputs": [],
   "source": [
    "# import time\n",
    "# start_spatial_time = time.time()\n",
    "# test_spatial_join.select('uuid', 'Description', 'is_timestamp').write.format('com.databricks.spark.csv')\\\n",
    "#         .mode(\"overwrite\")\\\n",
    "#         .option('header', True)\\\n",
    "#         .save('sentinel_is2_intersect_pure_spatial')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "id": "constitutional-rochester",
   "metadata": {},
   "outputs": [],
   "source": [
    "# with open('speed_pure_spatial_join_month.txt', 'w') as f:\n",
    "#     f.write('pure spatial join time:')\n",
    "#     f.write(str(time.time() - start_spatial_time))\n",
    "#     f.write('/n')\n",
    "#     f.write('number of spatial intersections:')\n",
    "#     f.write(str(test_spatial_join.count()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "id": "individual-intensity",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- geometry: geometry (nullable = false)\n",
      " |-- s2_index: string (nullable = true)\n",
      " |-- Sentinel_date_date_type: timestamp (nullable = true)\n",
      " |-- hour: integer (nullable = true)\n",
      " |-- date: date (nullable = true)\n",
      " |-- time_index: long (nullable = true)\n",
      " |-- time_index_join: long (nullable = true)\n",
      " |-- sentinel_max_timestamp: long (nullable = true)\n",
      " |-- sentinel_min_timestamp: long (nullable = true)\n",
      " |-- s2_geometry: geometry (nullable = false)\n",
      " |-- s2_index_2: string (nullable = true)\n",
      " |-- Sentinel_date_date_type_2: timestamp (nullable = true)\n",
      " |-- hour_2: integer (nullable = true)\n",
      " |-- date: date (nullable = true)\n",
      " |-- time_index_2: long (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "test_temporal_indexed_spatial_join.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "id": "opposite-burst",
   "metadata": {},
   "outputs": [],
   "source": [
    "# test_temporal_indexed_spatial_join = test_temporal_indexed_spatial_join.cache()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "id": "toxic-temperature",
   "metadata": {},
   "outputs": [],
   "source": [
    "test_ST_join = test_temporal_indexed_spatial_join.filter(F.col('sentinel_max_timestamp') >= F.unix_timestamp(\"Sentinel_date_date_type_2\")).filter(F.col('sentinel_min_timestamp') <= F.unix_timestamp(\"Sentinel_date_date_type_2\")).filter(F.col('s2_index') != F.col(\"s2_index_2\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "id": "wireless-vocabulary",
   "metadata": {},
   "outputs": [],
   "source": [
    "# test_ST_join.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "id": "tired-illustration",
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "start_temporal_time = time.time()\n",
    "test_ST_join.select('s2_index', 's2_index_2', 'Sentinel_date_date_type', 'Sentinel_date_date_type_2').drop_duplicates(subset=['s2_index', 's2_index_2']).write.format('com.databricks.spark.csv')\\\n",
    "        .mode(\"overwrite\")\\\n",
    "        .option('header', True)\\\n",
    "        .save('sentinel_is2_self_join')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "id": "substantial-documentary",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('STSI_{}_months_{}_hours_delay_S2_self_intersect.txt'.format(duration_month, hour_delay), 'w') as f:\n",
    "    f.write('temporal_first join time in total:')\n",
    "    f.write(str(time.time() - start_temporal_time))\n",
    "    f.write('\\n')\n",
    "    f.write('number of spatial intersections:')\n",
    "    f.write(str(test_ST_join.count()))\n",
    "    f.write('\\n')\n",
    "    f.write('Sentinel 2 data count:')\n",
    "    f.write(str(sentinel_sedona.count()))\n",
    "    f.write('\\n')\n",
    "    f.write('hour delay:')\n",
    "    f.write(str(hour_delay))\n",
    "    f.write('\\n')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
